转录组分析

Author

Guangchuang Yu
School of Basic Medical Sciences, Southern Medical University

Published

November 11, 2025

实验基于R语言环境,安装R语言,可参考:

示例数据

library(yulab.utils)
pload(airway)
── loading the package: airway ──
dir <- system.file("extdata", package="airway", mustWork=TRUE)
list.files(dir)
 [1] "GSE52778_series_matrix.txt"        "Homo_sapiens.GRCh37.75_subset.gtf"
 [3] "quants"                            "sample_table.csv"                 
 [5] "SraRunInfo_SRP033351.csv"          "SRR1039508_subset.bam"            
 [7] "SRR1039509_subset.bam"             "SRR1039512_subset.bam"            
 [9] "SRR1039513_subset.bam"             "SRR1039516_subset.bam"            
[11] "SRR1039517_subset.bam"             "SRR1039520_subset.bam"            
[13] "SRR1039521_subset.bam"            
list.files(file.path(dir, "quants"))
[1] "SRR1039508" "SRR1039509"
csvfile <- file.path(dir, "sample_table.csv")
coldata <- read.csv(csvfile, row.names=1, stringsAsFactors=FALSE)
coldata
           SampleName    cell   dex albut        Run avgLength Experiment
SRR1039508 GSM1275862  N61311 untrt untrt SRR1039508       126  SRX384345
SRR1039509 GSM1275863  N61311   trt untrt SRR1039509       126  SRX384346
SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512       126  SRX384349
SRR1039513 GSM1275867 N052611   trt untrt SRR1039513        87  SRX384350
SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516       120  SRX384353
SRR1039517 GSM1275871 N080611   trt untrt SRR1039517       126  SRX384354
SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520       101  SRX384357
SRR1039521 GSM1275875 N061011   trt untrt SRR1039521        98  SRX384358
              Sample    BioSample
SRR1039508 SRS508568 SAMN02422669
SRR1039509 SRS508567 SAMN02422675
SRR1039512 SRS508571 SAMN02422678
SRR1039513 SRS508572 SAMN02422670
SRR1039516 SRS508575 SAMN02422682
SRR1039517 SRS508576 SAMN02422673
SRR1039520 SRS508579 SAMN02422683
SRR1039521 SRS508580 SAMN02422677
coldata <- coldata[1:2,]
coldata$names <- coldata$Run
coldata$files <- file.path(dir, "quants", coldata$names, "quant.sf.gz")
file.exists(coldata$files)
[1] TRUE TRUE
coldata
           SampleName   cell   dex albut        Run avgLength Experiment
SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508       126  SRX384345
SRR1039509 GSM1275863 N61311   trt untrt SRR1039509       126  SRX384346
              Sample    BioSample      names
SRR1039508 SRS508568 SAMN02422669 SRR1039508
SRR1039509 SRS508567 SAMN02422675 SRR1039509
                                                                            files
SRR1039508 E:/software-data/RLibrary/airway/extdata/quants/SRR1039508/quant.sf.gz
SRR1039509 E:/software-data/RLibrary/airway/extdata/quants/SRR1039509/quant.sf.gz

定量

pload("tximeta")
se <- tximeta(coldata)
dim(se)
head(rownames(se))
gse <- summarizeToGene(se)
dim(gse)

上面的代码运行时间较长,可以直接加载airway包中的示例数据gse

data(gse, package="airway")
gse
class: RangedSummarizedExperiment 
dim: 58294 8 
metadata(6): tximetaInfo quantInfo ... txomeInfo txdbInfo
assays(3): counts abundance length
rownames(58294): ENSG00000000003.14 ENSG00000000005.5 ...
  ENSG00000285993.1 ENSG00000285994.1
rowData names(1): gene_id
colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
colData names(3): names donor condition

访问相应的信息

pload(SummarizedExperiment)
── loading the package: SummarizedExperiment ──
head(rownames(gse))
[1] "ENSG00000000003.14" "ENSG00000000005.5"  "ENSG00000000419.12"
[4] "ENSG00000000457.13" "ENSG00000000460.16" "ENSG00000000938.12"
assayNames(gse)
[1] "counts"    "abundance" "length"   
head(assay(gse), 3)
                   SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
ENSG00000000003.14    708.164    467.962    900.992    424.368   1188.295
ENSG00000000005.5       0.000      0.000      0.000      0.000      0.000
ENSG00000000419.12    455.000    510.000    604.000    352.000    583.000
                   SRR1039517 SRR1039520 SRR1039521
ENSG00000000003.14   1090.668    805.929    599.337
ENSG00000000005.5       0.000      0.000      0.000
ENSG00000000419.12    773.999    409.999    499.000
rowRanges(gse)
GRanges object with 58294 ranges and 1 metadata column:
                     seqnames              ranges strand |            gene_id
                        <Rle>           <IRanges>  <Rle> |        <character>
  ENSG00000000003.14     chrX 100627109-100639991      - | ENSG00000000003.14
   ENSG00000000005.5     chrX 100584802-100599885      + |  ENSG00000000005.5
  ENSG00000000419.12    chr20   50934867-50958555      - | ENSG00000000419.12
  ENSG00000000457.13     chr1 169849631-169894267      - | ENSG00000000457.13
  ENSG00000000460.16     chr1 169662007-169854080      + | ENSG00000000460.16
                 ...      ...                 ...    ... .                ...
   ENSG00000285990.1    chr14   19244904-19269380      - |  ENSG00000285990.1
   ENSG00000285991.1     chr6 149817937-149896011      - |  ENSG00000285991.1
   ENSG00000285992.1     chr8   47129262-47132628      + |  ENSG00000285992.1
   ENSG00000285993.1    chr18   46409197-46410645      - |  ENSG00000285993.1
   ENSG00000285994.1    chr10   12563151-12567351      + |  ENSG00000285994.1
  -------
  seqinfo: 25 sequences (1 circular) from hg38 genome
seqinfo(rowRanges(gse))
Seqinfo object with 25 sequences (1 circular) from hg38 genome:
  seqnames seqlengths isCircular genome
  chr1      248956422      FALSE   hg38
  chr2      242193529      FALSE   hg38
  chr3      198295559      FALSE   hg38
  chr4      190214555      FALSE   hg38
  chr5      181538259      FALSE   hg38
  ...             ...        ...    ...
  chr21      46709983      FALSE   hg38
  chr22      50818468      FALSE   hg38
  chrX      156040895      FALSE   hg38
  chrY       57227415      FALSE   hg38
  chrM          16569       TRUE   hg38
colData(gse)
DataFrame with 8 rows and 3 columns
                names    donor     condition
             <factor> <factor>      <factor>
SRR1039508 SRR1039508  N61311  Untreated    
SRR1039509 SRR1039509  N61311  Dexamethasone
SRR1039512 SRR1039512  N052611 Untreated    
SRR1039513 SRR1039513  N052611 Dexamethasone
SRR1039516 SRR1039516  N080611 Untreated    
SRR1039517 SRR1039517  N080611 Dexamethasone
SRR1039520 SRR1039520  N061011 Untreated    
SRR1039521 SRR1039521  N061011 Dexamethasone

差异分析

创建DESeqDataSet对象

pload(DESeq2)
── loading the package: DESeq2 ──
dds <- DESeqDataSet(gse, design = ~ donor + condition)
using counts and average transcript lengths from tximeta
hist(log(rowSums(counts(dds))))

nrow(dds)
[1] 58294
keep <- rowSums(counts(dds)) > 1
dds <- dds[keep, ]
nrow(dds)
[1] 31604

PCA

pload(glmpca)
── loading the package: glmpca ──
gpca <- glmpca(counts(dds), L=2)
gpca.dat <- gpca$factors
gpca.dat$condition <- dds$condition
gpca.dat$donor <- dds$donor
head(gpca.dat)
                dim1       dim2     condition   donor
SRR1039508  60.05741  22.733597     Untreated  N61311
SRR1039509 -34.09034   6.874379 Dexamethasone  N61311
SRR1039512  34.26431  34.774115     Untreated N052611
SRR1039513 -65.38916  15.653289 Dexamethasone N052611
SRR1039516  69.47256 -61.693976     Untreated N080611
SRR1039517 -35.37449 -98.924087 Dexamethasone N080611
pload(ggplot2)
── loading the package: ggplot2 ──
ggplot(gpca.dat, aes(x = dim1, y = dim2, 
                    color = condition, shape = donor)) +
  geom_point(size =3) + 
  ggtitle("glmpca - Generalized PCA")

差异表达分析

dds <- DESeq(dds)
estimating size factors
using 'avgTxLength' from assays(dds), correcting for library size
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
res <- results(dds)
res
log2 fold change (MLE): condition Dexamethasone vs Untreated 
Wald test p-value: condition Dexamethasone vs Untreated 
DataFrame with 31604 rows and 6 columns
                     baseMean log2FoldChange     lfcSE      stat      pvalue
                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
ENSG00000000003.14 739.940717     -0.3611537  0.106869 -3.379419 0.000726392
ENSG00000000419.12 511.735722      0.2063147  0.128665  1.603509 0.108822318
ENSG00000000457.13 314.194855      0.0378308  0.158633  0.238479 0.811509461
ENSG00000000460.16  79.793622     -0.1152590  0.314991 -0.365912 0.714430444
ENSG00000000938.12   0.307267     -1.3691185  3.503764 -0.390757 0.695977205
...                       ...            ...       ...       ...         ...
ENSG00000285979.1   38.353886      0.3423657  0.359511  0.952310    0.340940
ENSG00000285987.1    1.562508      0.7064145  1.547295  0.456548    0.647996
ENSG00000285990.1    0.642315      0.3647333  3.433276  0.106235    0.915396
ENSG00000285991.1   11.276284     -0.1165515  0.748601 -0.155692    0.876275
ENSG00000285994.1    3.651041     -0.0960094  1.068660 -0.089841    0.928414
                         padj
                    <numeric>
ENSG00000000003.14 0.00531137
ENSG00000000419.12 0.29318870
ENSG00000000457.13 0.92255697
ENSG00000000460.16 0.87298038
ENSG00000000938.12         NA
...                       ...
ENSG00000285979.1    0.600750
ENSG00000285987.1          NA
ENSG00000285990.1          NA
ENSG00000285991.1    0.952921
ENSG00000285994.1          NA
summary(res)

out of 31604 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 2373, 7.5%
LFC < 0 (down)     : 1949, 6.2%
outliers [1]       : 0, 0%
low counts [2]     : 14706, 47%
(mean count < 9)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

火山图

pload(org.Hs.eg.db)
── loading the package: org.Hs.eg.db ──
df <- as.data.frame(res)
df$gene_id <- sub("\\.\\w+$", "", rownames(df))
df$symbol <- mapIds(org.Hs.eg.db, 
                    keys = df$gene_id,
                    column = "SYMBOL", 
                    keytype = "ENSEMBL", 
                    multiVals = "first")
'select()' returned 1:many mapping between keys and columns
head(df)
                       baseMean log2FoldChange      lfcSE       stat
ENSG00000000003.14  739.9407174    -0.36115368 0.10686857 -3.3794190
ENSG00000000419.12  511.7357220     0.20631474 0.12866453  1.6035090
ENSG00000000457.13  314.1948551     0.03783077 0.15863341  0.2384792
ENSG00000000460.16   79.7936215    -0.11525899 0.31499067 -0.3659124
ENSG00000000938.12    0.3072668    -1.36911852 3.50376359 -0.3907565
ENSG00000000971.15 5716.7968219     0.44708633 0.08938729  5.0016766
                         pvalue         padj         gene_id symbol
ENSG00000000003.14 7.263921e-04 5.311369e-03 ENSG00000000003 TSPAN6
ENSG00000000419.12 1.088223e-01 2.931887e-01 ENSG00000000419   DPM1
ENSG00000000457.13 8.115095e-01 9.225570e-01 ENSG00000000457  SCYL3
ENSG00000000460.16 7.144304e-01 8.729804e-01 ENSG00000000460  FIRRM
ENSG00000000938.12 6.959772e-01           NA ENSG00000000938    FGR
ENSG00000000971.15 5.683387e-07 9.085892e-06 ENSG00000000971    CFH
pload(ivolcano)
── loading the package: ivolcano ──
ivolcano(df,
        logFC_col = "log2FoldChange",
        pval_col = "padj",
        gene_col = "symbol",
        top_n = 5,
        onclick_fun=onclick_genecards)

基因表达水平

resSig <- subset(res, padj < 0.01)
topGene <- rownames(res)[which.min(res$padj)]
topGene
[1] "ENSG00000189221.9"
geneCounts <- plotCounts(dds, gene = topGene, 
                intgroup = c("donor", "condition"), 
                returnData = TRUE)
geneCounts
               count   donor     condition
SRR1039508  445.0772  N61311     Untreated
SRR1039509 4570.3817  N61311 Dexamethasone
SRR1039512  468.9803 N052611     Untreated
SRR1039513 4398.0477 N052611 Dexamethasone
SRR1039516  517.5761 N080611     Untreated
SRR1039517 4607.8726 N080611 Dexamethasone
SRR1039520  253.5867 N061011     Untreated
SRR1039521 3732.9201 N061011 Dexamethasone
ggplot(geneCounts, aes(condition, count)) + 
    geom_boxplot(aes(fill=condition), alpha=.3) +
    geom_point(aes(color=donor)) + 
    geom_line(aes(group=donor, color=donor)) +
    scale_y_log10()

差异基因热图

rlog_expr <- assay(rlog(dds))
mat1 <- rlog_expr[order(res$padj)[1:20], ]

pload(dplyr)
── loading the package: dplyr ──
res2 <- as.data.frame(res)
res2$ID <- rownames(res2)
de <- arrange(res2, padj) |>
    group_by(sign(log2FoldChange)) |>
    slice(1:20) |>
    pull(ID)

mat2 <- rlog_expr[de, ]

anno <- as.data.frame(colData(dds)[, c("donor", "condition")])


plot_heatmap <- function(mat, anno) {
    mat <- mat - rowMeans(mat)
    p <- pheatmap::pheatmap(mat, annotation_col = anno)
    return(p)
}

p1 <- plot_heatmap(mat1, anno)
p2 <- plot_heatmap(mat2, anno)
aplot::plot_list(p1, p2)

富集分析

df$entrez <- mapIds(org.Hs.eg.db, 
                    keys = df$gene_id,
                    column = "ENTREZID", 
                    keytype = "ENSEMBL", 
                    multiVals = "first")
'select()' returned 1:many mapping between keys and columns
# resSig <- subset(df, padj < 0.01)
resSig <- df[order(df$padj)[1:100], ]

library(clusterProfiler)
x <- enrichGO(resSig$entrez, OrgDb = org.Hs.eg.db, ont="BP", readable=TRUE)
dotplot(x)

fc <- resSig$log2FoldChange
names(fc) <- resSig$symbol
cnetplot(x, foldChange=fc)

Bonus: 基因信息检索

pload(fanyi)
── loading the package: fanyi ──
fanyi v0.0.8 Learn more at https://yulab-smu.top/

Please cite:

D Wang, G Chen, L Li, S Wen, Z Xie, X Luo, L Zhan, S Xu, J Li, R
Wang, Q Wang, G Yu. Reducing language barriers, promoting information
absorption, and communication using fanyi. Chinese Medical Journal. 2024,
137(16):1950-1956. doi: 10.1097/CM9.0000000000003242
gs <- gene_summary(df$entrez[1:10])
gs$summary2 <- translate(gs$summary)

head(gs)
         uid   name                                                 description
TSPAN6  7105 TSPAN6                                               tetraspanin 6
DPM1    8813   DPM1 dolichyl-phosphate mannosyltransferase subunit 1, catalytic
SCYL3  57147  SCYL3                                    SCY1 like pseudokinase 3
FIRRM  55732  FIRRM   FIGNL1 interacting regulator of recombination and mitosis
FGR     2268    FGR              FGR proto-oncogene, Src family tyrosine kinase
CFH     3075    CFH                                         complement factor H
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  summary
TSPAN6 The protein encoded by this gene is a member of the transmembrane 4 superfamily, also known as the tetraspanin family. Most of these members are cell-surface proteins that are characterized by the presence of four hydrophobic domains. The proteins mediate signal transduction events that play a role in the regulation of cell development, activation, growth and motility. The protein encoded by this gene is a cell surface glycoprotein and is highly similar in sequence to the transmembrane 4 superfamily member 2 protein. It functions as a negative regulator of retinoic acid-inducible gene I-like receptor-mediated immune signaling via its interaction with the mitochondrial antiviral signaling-centered signalosome. This gene uses alternative polyadenylation sites, and multiple transcript variants result from alternative splicing. [provided by RefSeq, Jul 2013]
DPM1                                                                                                                                                                                                                         Dolichol-phosphate mannose (Dol-P-Man) serves as a donor of mannosyl residues on the lumenal side of the endoplasmic reticulum (ER). Lack of Dol-P-Man results in defective surface expression of GPI-anchored proteins. Dol-P-Man is synthesized from GDP-mannose and dolichol-phosphate on the cytosolic side of the ER by the enzyme dolichyl-phosphate mannosyltransferase. Human DPM1 lacks a carboxy-terminal transmembrane domain and signal sequence and is regulated by DPM2. Mutations in this gene are associated with congenital disorder of glycosylation type Ie. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Nov 2015]
SCYL3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              This gene encodes a protein with a kinase domain and four HEAT repeats. The encoded protein interacts with the C-terminal domain of ezrin, an ERM protein, and may play a role in cell adhesion and migration. Alternative splicing results in multiple transcript variants encoding multiple isoforms. [provided by RefSeq, Jun 2012]
FIRRM                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             Enables protein kinase binding activity. Involved in several processes, including chromosome segregation; interstrand cross-link repair; and regulation of protein kinase activity. Located in several cellular components, including midbody; nuclear lumen; and spindle midzone. [provided by Alliance of Genome Resources, Jul 2025]
FGR                                                                                                                                                                  This gene is a member of the Src family of protein tyrosine kinases (PTKs). The encoded protein contains N-terminal sites for myristylation and palmitylation, a PTK domain, and SH2 and SH3 domains which are involved in mediating protein-protein interactions with phosphotyrosine-containing and proline-rich motifs, respectively. The protein localizes to plasma membrane ruffles, and functions as a negative regulator of cell migration and adhesion triggered by the beta-2 integrin signal transduction pathway. Infection with Epstein-Barr virus results in the overexpression of this gene. Multiple alternatively spliced variants, encoding the same protein, have been identified. [provided by RefSeq, Jul 2008]
CFH                                                                                                                                                                                                                                                                                        This gene is a member of the Regulator of Complement Activation (RCA) gene cluster and encodes a protein with twenty short consensus repeat (SCR) domains. This protein is secreted into the bloodstream and has an essential role in the regulation of complement activation, restricting this innate defense mechanism to microbial infections. Mutations in this gene have been associated with hemolytic-uremic syndrome (HUS) and chronic hypocomplementemic nephropathy. Alternate transcriptional splice variants, encoding different isoforms, have been characterized. [provided by RefSeq, Oct 2011]
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       summary2
TSPAN6 该基因编码的蛋白质是跨膜4超家族的成员,也称为tetraspanin家族。这些成员中的大多数是细胞表面蛋白,其特征是存在四个疏水结构域。这些蛋白质介导信号转导事件,在细胞发育、活化、生长和运动的调节中发挥作用。该基因编码的蛋白质是细胞表面糖蛋白,在序列上与跨膜4超家族成员2蛋白质高度相似。它通过与以线粒体抗病毒信号为中心的信号体相互作用,作为视黄酸诱导基因I样受体介导的免疫信号传导的负调节因子。该基因使用选择性多聚腺苷酸化位点,选择性剪接产生多种转录变体。【由RefSeq提供,2013年7月】
DPM1                                                                                                                                    磷酸多糖甘露糖(Dol-P-Man)作为内质网(ER)内腔侧甘露糖基残基的供体。Dol-P-Man的缺乏导致GPI锚定蛋白的表面表达缺陷。Dol-P-Man是由GDP-甘露糖和ER胞质侧的磷酸多糖醇通过磷酸多糖基甘露糖基转移酶合成的。人DPM1缺乏羧基末端跨膜结构域和信号序列,受DPM2调节。该基因的突变与先天性糖基化I型障碍有关。选择性剪接导致多种转录变体。【由RefSeq提供,2015年11月】
SCYL3                                                                                                                                                                                                                                                                         该基因编码一种具有激酶结构域和四个HEAT重复序列的蛋白质。编码的蛋白质与ERM蛋白埃兹林的C末端结构域相互作用,可能在细胞粘附和迁移中发挥作用。选择性剪接导致编码多种异构体的多种转录变体。【由RefSeq于2012年6月提供】
FIRRM                                                                                                                                                                                                                                                                                                         激活蛋白激酶结合活性。参与了几个过程,包括染色体分离;跨线路维修;以及蛋白激酶活性的调节。位于几个细胞组成部分,包括中体;核腔;以及纺锤体中部。[由基因组资源联盟提供,2025年7月]
FGR                                                                                 该基因是蛋白酪氨酸激酶(PTKs)Src家族的成员。编码的蛋白质包含肉豆蔻酰化和棕榈酰化的N末端位点、PTK结构域以及SH2和SH3结构域,它们分别参与介导蛋白质与含磷酸酪氨酸和富含脯氨酸的基序的相互作用。该蛋白定位于质膜褶皱,并作为β2整合素信号转导途径引发的细胞迁移和粘附的负调节因子发挥作用。感染爱泼斯坦-巴尔病毒会导致该基因的过表达。已经鉴定出编码相同蛋白质的多个选择性剪接变体。【由RefSeq提供,2008年7月】
CFH                                                                                                                                                           该基因是补体激活调节因子(RCA)基因簇的成员,编码一种具有20个短共有重复(SCR)结构域的蛋白质。这种蛋白质分泌到血液中,在调节补体激活中起着至关重要的作用,将这种先天防御机制限制在微生物感染中。该基因突变与溶血性尿毒症综合征(HUS)和慢性低补体肾病有关。编码不同异构体的交替转录剪接变体已被表征。【由RefSeq提供,2011年10月】
translate_ggplot(dotplot(x), axis='y')